In [1]:
%matplotlib inline
In [2]:
#from __future__ import division
import pandas as pd
import numpy as np
from altair import Chart
In [3]:
!ls -lah ../data/*csv
In [4]:
# Parameter grid: one CSV exists per (offset, window size) combination.
offsets = [150, 200, 300]
winsizes = [50, 80, 100, 200]
output_tpl = '../data/dfa_mp.offset_{}.win_{}.csv'

# Load every file, tagging each row with the parameters encoded in its file
# name so the frames can be stacked and filtered on 'offset' / 'win' later.
output = []
for offset in offsets:
    for winsize in winsizes:
        # DataFrame.from_csv was deprecated and later removed from pandas;
        # read_csv with index_col=0 and parse_dates=True reproduces its defaults.
        df = pd.read_csv(output_tpl.format(offset, winsize),
                         index_col=0, parse_dates=True)
        df['win'] = winsize
        df['offset'] = offset
        output.append(df)

# Single concatenation after the loop (row indices of the source files are kept).
dfa = pd.concat(output)
In [5]:
# UTR length is the distance between the 'start_x' and 'end_x' columns.
dfa['UTR_length'] = dfa['end_x'].sub(dfa['start_x'])
dfa
Out[5]:
In [6]:
# Keep one parameter combination (offset 200, window 80) and only rows with
# UTR_length > 80 and ratio_ATCACG > 2.
keep_rows = (
    dfa['UTR_length'].gt(80)
    & dfa['ratio_ATCACG'].gt(2)
    & dfa['offset'].eq(200)
    & dfa['win'].eq(80)
)
d = dfa.loc[keep_rows, ['UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']].copy()

# Add log10-transformed versions of the three columns (new name <- source column).
for log_col, raw_col in (('log-bcm', 'ratio_ATCACG'),
                         ('log+bcm', 'ratio_CGATGT'),
                         ('loglen', 'UTR_length')):
    d[log_col] = np.log10(d[raw_col])
In [7]:
# (rows, columns) of the filtered frame `d`.
d.shape
Out[7]:
In [18]:
# Display the filtered frame `d`, including its log10 columns.
d
Out[18]:
In [28]:
from copy import deepcopy
import statsmodels.api as sm
import altair
def linear_regression(x, y):
    """Fit a straight line to (x, y) by least squares and evaluate it at ``x``.

    Parameters
    ----------
    x, y : array-like
        Sample coordinates; passed unchanged to ``np.polyfit``.

    Returns
    -------
    The fitted line's values at each element of ``x``, as produced by
    ``np.polyval``.
    """
    # Degree-1 polynomial: coefficients come back as (slope, intercept).
    line_coefficients = np.polyfit(x, y, deg=1)
    fitted_values = np.polyval(line_coefficients, x)
    return fitted_values
def lowess(x, y, frac=1.0 / 7):
    """Smooth ``y`` against ``x`` with statsmodels' LOWESS estimator.

    Parameters
    ----------
    x, y : array-like
        Independent and dependent values. Note that
        ``sm.nonparametric.lowess`` takes them in (y, x) order.
    frac : float, optional
        Fraction of the data used for each local fit. Defaults to 1/7, the
        value that was previously hard-coded. It is written as a float
        division so it is not truncated to 0 when true division is not
        enabled.

    Returns
    -------
    The smoothed values, one per input point and in the input order
    (``return_sorted=False``).
    """
    return sm.nonparametric.lowess(y, x, frac=frac, return_sorted=False)
def rmean(x, y):
    """Centered rolling mean of ``y`` over a window of about 1/20 of its length.

    Parameters
    ----------
    x : unused
        Accepted only so the function has the same ``(x, y)`` call shape as
        the other fitting functions.
    y : pandas.Series
        Values to smooth; must provide ``.shape`` and ``.rolling``.

    Returns
    -------
    The result of ``y.rolling(...).mean()`` with a centered window.
    """
    # For fewer than 20 rows the integer division yields 0, i.e. a zero-width
    # window with nothing to average; clamp the window to at least one row.
    win = max(1, y.shape[0] // 20)
    return y.rolling(center=True, window=win).mean()
class RegressionChart(altair.Chart):
    """``altair.Chart`` subclass that can overlay a fitted line on its points.

    The chart's data must be a pandas DataFrame. The fit is computed in pandas
    and stored in an extra ``<y>_fit`` column that the line layer plots.
    """

    @staticmethod
    def _add_regression_column(group, regression_func, x, y, yfit):
        """Return a copy of ``group`` with the fitted values in column ``yfit``.

        Parameters
        ----------
        group : pandas.DataFrame
            Rows to fit (the whole data set or one group of it).
        regression_func : callable
            Called as ``regression_func(group[x], group[y])``; its result is
            assigned to the new column.
        x, y : str
            Names of the independent / dependent columns.
        yfit : str
            Name of the column that receives the fitted values.
        """
        # Work on a copy: this helper is also used as a groupby.apply callback,
        # and mutating the object pandas passes to such a callback is
        # unsupported.
        fitted = group.copy()
        fitted[yfit] = regression_func(group[x], group[y])
        return fitted

    def regression_plot(self, func=linear_regression, **kwargs):
        """Build a layered chart: the points plus a line through the fit.

        Parameters
        ----------
        func : callable, optional
            Fitting function called as ``func(x_values, y_values, **kwargs)``
            and returning one fitted value per row.
        **kwargs
            Extra keyword arguments forwarded to ``func``.

        Returns
        -------
        ``altair.LayeredChart`` over the data with the added ``<y>_fit``
        column, with a point layer and a line layer.

        Raises
        ------
        ValueError
            If the chart data is not a DataFrame, or any encoding is binned.
        KeyError
            If the ``x`` or ``y`` encoding (or its ``field``) is missing.
        """
        if not isinstance(self.data, pd.DataFrame):
            raise ValueError("data must be a DataFrame")

        points = self.mark_point()
        lines = deepcopy(self).mark_line()

        encoding = points.encoding.to_dict()
        if any(enc.get('bin', False) for enc in encoding.values()):
            raise ValueError("regression_plot() cannot handle binned variables")

        # Every encoded field other than x / y (e.g. color) defines a group
        # that gets its own fit. Encodings without a 'field' entry are skipped
        # instead of raising KeyError.
        group_cols = [enc['field'] for key, enc in encoding.items()
                      if key not in ('x', 'y') and 'field' in enc]
        x = encoding['x']['field']
        y = encoding['y']['field']
        yfit = y + '_fit'
        lines.encode(y=yfit)

        def fit(x_values, y_values):
            # Bind the caller's extra keyword arguments to the fitting function.
            return func(x_values, y_values, **kwargs)

        if group_cols:
            groups = self.data.groupby(group_cols)
            data = groups.apply(self._add_regression_column,
                                regression_func=fit,
                                x=x, y=y, yfit=yfit)
        else:
            # The helper copies its input, so self.data is left untouched.
            data = self._add_regression_column(self.data,
                                               regression_func=fit,
                                               x=x, y=y, yfit=yfit)
        return altair.LayeredChart(data).set_layers(points, lines)
In [30]:
from altair import X, Y, Scale
# 'log-bcm' against log10 UTR length, with the `lowess` fit overlaid.
loglen_axis = X('loglen:Q', scale=Scale(domain=(1.6, 3)))
scatter = RegressionChart(d).mark_circle().encode(loglen_axis, y='log-bcm')
scatter.regression_plot(func=lowess)
In [17]:
from altair import X, Y, Scale
# 'log-bcm' against log10 UTR length, with the `linear_regression` fit overlaid.
loglen_axis = X('loglen:Q', scale=Scale(domain=(1.6, 3)))
scatter = RegressionChart(d).mark_circle().encode(loglen_axis, y='log-bcm')
scatter.regression_plot(func=linear_regression)
In [ ]: